Overview/Goals

Using time-course RNASeq data from Ophiocordyceps camponoti-floridani liquid culture:

  • build a circadian gene co-expression network (GCN),
  • annotate the network
    • identify modules containing rhy24 genes
    • identify modules containing differentially rhythmic genes (Ocflo v. Beau)
    • identify modules containing differentially expressed genes (inf v. control)

Step 1: Build circadian GCN

1.1 Load data

Dataset: O. camponoti-floridani grown in liquid culture as blastspore (three pooled per time point for RNA-extraction and -sequencing), collected every 2h, over a 24h-period. [Control]

# loading database which contains data for Das and de Bekker 2022, from GitHub
db <- dbConnect(RSQLite::SQLite(), paste0(path_to_repo,"/data/databases/TC6_fungal_data.db"))

# specify sample name
sample.name <- "ophio_cflo"

# extract the (gene-expr X time-point) data
dat <-
  db %>%
  tbl(., paste0(sample.name ,"_fpkm")) %>%
  select(gene_name, everything()) %>%
  collect()

writeLines("What is the dimensions of the original dataset? [Rows = #genes, Cols = #samples]")
## What is the dimensions of the original dataset? [Rows = #genes, Cols = #samples]
dim(dat[-1])
## [1] 7455   12

1.2 Clean data

The above dataset contains all genes ??(n=13,808)?? (7455 genes, 12 samples) in the fungi genome. However, not all of these genes are expressed, and some are expressed at very low levels that are not biologically meaningful. Therefore, we will only keep the genes that are “expressed” (≥1 FPKM for at least half of all time points).

# Which genes are expressed throughout the day in Ophio-cflo?
  # count the number of time points that has ≥ 1 FPKM
  n.expressed <- apply(dat[-1], 1, function(x) sum(x >= 1))
  # subset the data and only keep the filtered genes
  dat <- dat[which(n.expressed >=6),]

writeLines("Dimensions of the data post-filtering step [Rows = #genes, Cols = #samples]")
## Dimensions of the data post-filtering step [Rows = #genes, Cols = #samples]
dim(dat)
## [1] 6874   13

This is our cleaned, input data file for building the circadian GCN. This contains 6874 genes in ??13?? samples.

1.3 Format data

  • Log2 transform the data
datExpr = as.data.frame(t(log2(dat[-c(1)]+1)))
names(datExpr) = dat$gene_name
rownames(datExpr) = names(dat)[-c(1)]

# USE THE FOLLOWING CODE TO CHECK IF YOU HAVE ANY BAD SAMPLES #
  # gsg = goodSamplesGenes(datExpr, verbose = 3);
  # gsg$allOK

  # sampleTree = hclust(dist(datExpr0), method = "average");
  # # Plot the sample tree: Open a graphic output window of size 12 by 9 inches
  # # The user should change the dimensions if the window is too large or too small.
  # sizeGrWindow(12,9)
  # #pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
  # par(cex = 1);
  # par(mar = c(0,4,2,0))
  # plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
  #      cex.axis = 1.5, cex.main = 2)

# save the number of genes and samples
# that will be used to create the circadian GCN
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

# visualize the log-transformed data
x = reshape2::melt(as.matrix(t(datExpr)))
colnames(x) = c('gene_id', 'sample', 'value')

writeLines("Visualizing the log-transformed data")
## Visualizing the log-transformed data
ggplot(x, aes(x=value, color=sample)) + geom_density() + theme_Publication()

1.4 Calculate gene-gene similarity

# Calculate Kendall's tau-b correlation for each gene-gene pair

# sim_matrix <- cor((datExpr), method = "kendall") # this step takes time
# save(sim_matrix, file = paste0(path_to_repo, "/results/temp_files/sim_matrix_", sample.name, "_TC6.RData")) # might be useful to save the sim_matrix and
load(paste0(path_to_repo, "/results/temp_files/sim_matrix_", sample.name, "_TC6.RData")) # load it up

## Let's display a chunk of the matrix (code from Hughitt 2016; github)
heatmap_indices <- sample(nrow(sim_matrix), 200)

writeLines(paste0("Plotting a chunk of the gene-gene similarity matrix with ", length(heatmap_indices), " genes."))
## Plotting a chunk of the gene-gene similarity matrix with 200 genes.
gplots::heatmap.2(t(sim_matrix[heatmap_indices, heatmap_indices]),
          col=inferno(100),
          labRow=NA, labCol=NA,
          trace='none', dendrogram='row',
          xlab='Gene', ylab='Gene',
          main= paste0("Similarity matrix \n correlation method = 'kendall' \n (", length(heatmap_indices), "random genes)"),
          density.info='none', revC=TRUE)

1.5 Create adjacency matrix

  • To create the adjacency matrix, we need to first identify the soft-thresholding power (see WGCNA for more info).
writeLines("Performing network topology analysis to pick soft-thresholding power")
## Performing network topology analysis to pick soft-thresholding power
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=30, by=2))
# # Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 6508.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 6508 of 6874
##    ..working on genes 6509 through 6874 of 6874
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1   0.5520  2.470          0.966 2430.00   2420.00 3430.0
## 2      2   0.0959  0.373          0.927 1220.00   1170.00 2210.0
## 3      3   0.1380 -0.330          0.904  716.00    656.00 1590.0
## 4      4   0.4970 -0.679          0.932  462.00    401.00 1210.0
## 5      5   0.6770 -0.910          0.940  318.00    260.00  952.0
## 6      6   0.7660 -1.030          0.953  229.00    175.00  772.0
## 7      7   0.8130 -1.130          0.958  170.00    123.00  638.0
## 8      8   0.8460 -1.190          0.968  130.00     89.00  536.0
## 9      9   0.8610 -1.240          0.969  102.00     66.00  458.0
## 10    10   0.8700 -1.300          0.968   81.40     49.70  396.0
## 11    12   0.8760 -1.380          0.965   54.00     29.50  304.0
## 12    14   0.8630 -1.440          0.950   37.60     18.40  240.0
## 13    16   0.8740 -1.480          0.957   27.20     12.00  194.0
## 14    18   0.8790 -1.500          0.963   20.20      8.17  159.0
## 15    20   0.8930 -1.510          0.971   15.40      5.73  132.0
## 16    22   0.9100 -1.510          0.981   12.00      4.08  112.0
## 17    24   0.9200 -1.510          0.988    9.47      2.98   95.8
## 18    26   0.9180 -1.520          0.986    7.60      2.21   82.7
## 19    28   0.9100 -1.550          0.984    6.18      1.65   72.3
## 20    30   0.9110 -1.570          0.985    5.09      1.26   64.3
writeLines("Plotting the resutls from the network topology analysis")
## Plotting the resutls from the network topology analysis
# Plot the results:
# sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

NOTE: The scale-free topology fit index reaches ~0.846 at a soft-thresholding-power=8, and it gets saturated after that. R2 is used to quentify how well a scale-free netork satisfies, ranging from 0 (non-scale free topology) to 1 (scale free topology). Networks in a biological contest do resemble scale-free topology. When we maximize for scale-free model fit, there is a natural trade-off between mean number of connections. We want to pick a soft thresshold that satifies the scale-free network while maintaning the highest connectivity.

Now, we can go ahead and create our adjacency matrix by power-transforming the similarity matrix (see WGCNA for more info).

## Specify the soft-thresholding-power which has been set at 8 for Ophio-cflo
soft.power = 8

# Construct adjacency matrix
# adj_matrix <- adjacency.fromSimilarity(sim_matrix,
#                                        power=soft.power,
#                                        type='signed')
# save(adj_matrix, file = paste0(path_to_repo, "/results/temp_files/adj_matrix_", sample.name, "_TC6.RData")) # might be useful to save the sim_matrix and
load(paste0(path_to_repo,"/results/temp_files/adj_matrix_", sample.name,"_TC6.RData")) # load it up


# Convert adj_matrix to matrix
gene_ids <- rownames(adj_matrix)

adj_matrix <- matrix(adj_matrix, nrow=nrow(adj_matrix))
rownames(adj_matrix) <- gene_ids
colnames(adj_matrix) <- gene_ids

writeLines(paste0("Plotting the power-transformed adjacency matrix for the same ", length(heatmap_indices)," genes as above"))
## Plotting the power-transformed adjacency matrix for the same 200 genes as above
## Same heatmap as before, but now with the power-transformed adjacency matrix
gplots::heatmap.2(t(adj_matrix[heatmap_indices, heatmap_indices]),
                  col=inferno(100),
                  labRow=NA, labCol=NA,
                  trace='none', dendrogram='row',
                  xlab='Gene', ylab='Gene',
                  main='Adjacency matrix',
                  density.info='none', revC=TRUE)

## Delete similarity matrix to free up memory
rm(sim_matrix)
# gc()

Step 2: Identify gene clusters

The following steps are performed as per guidelines from the WGCNA package and several tutorials made available online.

2.1 Create topological overalp matrix

# Turn adjacency into topological overlap
# TOM = TOMsimilarity(adj_matrix);
# dissTOM = 1-TOM
# save(dissTOM, file = paste0(path_to_repo, "/results/temp_files/dissTOM_", sample.name, "_TC6.RData")) # might be useful to save the sim_matrix and
load(paste0(path_to_repo, "/results/temp_files/dissTOM_", sample.name, "_TC6.RData")) # load it up

# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average")

writeLines("Plotting the resulting clustering tree (dendrogram)")
## Plotting the resulting clustering tree (dendrogram)
# sizeGrWindow(12,9)
par(mfrow=c(1,1))
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)

2.2 Identify clusters

User defined parameters:

  • minimum size (number of genes) of modules | var-name: minModuleSize
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 50;

# Module identification using dynamic tree cut:
dynamicMods= cutreeDynamic(dendro = geneTree,
                           distM = dissTOM,
                           method = "hybrid",
                           verbose = 4,
                           deepSplit = 3, # see WGCNA for more info on tuning parameters
                           pamRespectsDendro = FALSE,
                           minClusterSize = minModuleSize);
##  ..cutHeight not given, setting it to 0.988  ===>  99% of the (truncated) height range in dendro.
##  ..Going through the merge tree
##  
##  ..Going through detected branches and marking clusters..
##  ..Assigning Tree Cut stage labels..
##  ..Assigning PAM stage labels..
##  ....assigned 3086 objects to existing clusters.
##  ..done.
# view number of genes in each module
table(dynamicMods)
## dynamicMods
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 1169  865  568  419  361  316  308  292  291  284  276  163  144  137  135  104 
##   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
##  101   98   92   88   80   79   75   74   68   65   65   54   53   50
writeLines("How many genes are there in each of the initial modules (clusters) detected?
Note: The names of the modules (colors) have no meaning.")
## How many genes are there in each of the initial modules (clusters) detected?
## Note: The names of the modules (colors) have no meaning.
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
## dynamicColors
##         black          blue         brown          cyan     darkgreen 
##           308           865           568           137            79 
##      darkgrey    darkorange       darkred darkturquoise         green 
##            74            65            80            75           361 
##   greenyellow        grey60     lightcyan    lightgreen   lightyellow 
##           276           101           104            98            92 
##       magenta  midnightblue        orange          pink        purple 
##           291           135            68           292           284 
##           red     royalblue   saddlebrown        salmon       skyblue 
##           316            88            53           144            54 
##     steelblue           tan     turquoise         white        yellow 
##            50           163          1169            65           419

2.3 Merge similar modules

User defined parameters:

  • minimum correlation between two modules above which they are merged into one | var-name: MEDissThres
# Calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes

# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs, method = "kendall");

writeLines("Clustering the module eigengenes and identifying a cutoff to merge similar modules")
## Clustering the module eigengenes and identifying a cutoff to merge similar modules
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
# sizeGrWindow(7, 8)
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "MEDiss = 1-cor(MEs, method = 'kendall')")

# We choose a height cut of 0.3
MEDissThres = 0.3 # user-specified parameter value; see WGCNA manual for more info

# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")

writeLines(paste0("Merging modules that have a correlation ≥ ", 1-MEDissThres))
## Merging modules that have a correlation ≥ 0.7
# Call an automatic merging function
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
##  mergeCloseModules: Merging modules whose distance is less than 0.3
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 30 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 18 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 16 module eigengenes in given set.
##    Calculating new MEs...
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 16 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs;

writeLines("Plotting the identified clusters (denoted with colors) before and after merging.")
## Plotting the identified clusters (denoted with colors) before and after merging.
# sizeGrWindow(12, 9)
plotDendroAndColors(geneTree,
                    cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

# Rename to moduleColors
moduleColors = mergedColors

# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1

2.4 Calculate module-module similarity

writeLines("Calculating module-module similarity based on module-eigengene-expression.")
## Calculating module-module similarity based on module-eigengene-expression.
# Calculate similarity of the eigen-genes
sim_matrix_ME <- cor(mergedMEs, method = "kendall")

# calculate adj_matrix
adj_matrix_ME <- adjacency.fromSimilarity(sim_matrix_ME,
                                          power=1, # DO NOT power transform
                                          type='signed'
)

## CHANGE THE NAMES OF THE MODULES;
module_ids <- data.frame(
  old_labels = rownames(adj_matrix_ME) %>% str_split("ME", 2) %>% sapply("[", 2) %>% as.character(),
  new_labels = paste0("OC", 1:nrow(adj_matrix_ME)))

# coerce into a matrix
adj_matrix_ME <- matrix(adj_matrix_ME, nrow=nrow(adj_matrix_ME))
rownames(adj_matrix_ME) <- module_ids$new_labels
colnames(adj_matrix_ME) <- module_ids$new_labels

writeLines("Plotting the adjacency matrix that shows module-module similarity in expression")
## Plotting the adjacency matrix that shows module-module similarity in expression
gplots::heatmap.2(t(adj_matrix_ME),
                  col=inferno(100),
                  # labRow=NA, labCol=NA,
                  trace='none', dendrogram='row',
                  xlab='', ylab='',
                  # main='Similarity matrix - MEs \n correlation method = "kendall")',
                  main='Adjacency matrix - MEs \n modified edge weights)',
                  density.info='none', revC=TRUE)

2.5 Visualize the network

pacman::p_load(igraph)

adj_matrix_ME_igraph <- adj_matrix_ME

# get rid of low correlations (0.6 & 0.8 are arbitrary) [0.7 and 0.9]
adj_matrix_ME_igraph[adj_matrix_ME_igraph < 0.6] <- 0
adj_matrix_ME_igraph[adj_matrix_ME_igraph < 0.8 & adj_matrix_ME_igraph>0] <- 0.5
adj_matrix_ME_igraph[adj_matrix_ME_igraph >= 0.8] <- 1

# build_network
network <- graph.adjacency(adj_matrix_ME_igraph,
                           mode = "upper",
                           weighted = T,
                           diag = F)

# simplify network
network <- igraph::simplify(network)  # removes self-loops

# remove isolated vertices (keep only the nodes)
isolated <- which(degree(network)==0)
network <- igraph::delete.vertices(network, isolated)

# E(network)$width <- E(network)$weight + min(E(network)$weight) + 1 # offset=1

# colors <- as.character(module_ids$old_labels)
# V(network)$color <- colors
V(network)$color <- "white"

# genes_ME <- factor(moduleColors, levels=colors) %>% summary()
V(network)$size <- igraph::degree(network, mode = "all")^1.75
# V(network)$size <- log2(genes_ME)^1.3

V(network)$label.color <- "black"
V(network)$frame.color <- "black"

E(network)$width <- E(network)$weight^2*4
E(network)$color <- "black"

# ## highlight shortest paths between two vetices
# short.path <- igraph::get.shortest.paths(network, "S_5", "S_15")
# E(network, path = unlist(short.path[[1]]))$color <- col.scheme[2]
# E(network, path = unlist(short.path[[1]]))$width <- E(network)$weight*8

writeLines("Visualizing a simplified representation of the circadian GCN, with and without labels")
## Visualizing a simplified representation of the circadian GCN, with and without labels
par(mfrow = c(1,1))

## Circular layout
# png(paste0(path_to_repo, "/results/figures/", sample.name[1], "/", script.name,"/", sample.name[1],"_GCN_1.png"),
#     width = 20, height = 30, units = "cm", res = 1000)
# par(bg=NA)
plot(network,
     layout=layout.kamada.kawai,
       # layout=layout.fruchterman.reingold,
       # layout=layout.graphopt,
       # layout=layout_in_circle,
     vertex.label=NA
     # vertex.size=hub.score(network)$vector*30
     # vertex.shape="none"
)

# dev.off()

# png(paste0(path_to_repo, "/results/figures/", sample.name[1], "/", script.name,"/", sample.name[1],"_GCN_2.png"),
#     width = 20, height = 30, units = "cm", res = 600)
# par(bg=NA)
plot(network,
     size=20,
     layout=layout.kamada.kawai,
       # layout=layout.fruchterman.reingold
       # layout=layout.graphopt
       # layout=layout_in_circle,
     # vertex.label=NA
     # vertex.size=hub.score(network)$vector*30
     vertex.shape="none"
)

# dev.off()
par(mfrow = c(1,1))

Step 3: Annotate the network

list of module x genes

# Make a list that returns gene names for a given cluster
module_genes <- list()

modules.to.exclude <- c("")
# modules.to.exclude <- c(paste0("OC",c(4,6,9,12:16)))
which.modules <- module_ids %>% filter(!new_labels %in% modules.to.exclude) %>% pull(old_labels)
which.labels <- module_ids %>% filter(!new_labels %in% modules.to.exclude) %>% pull(new_labels)

# Get the genes from each of the modules
for (i in 1:length(which.modules)) {
  
  # which color
  mod.color = as.character(which.modules[[i]])
  
  # subset
  module_genes[[i]] <- names(datExpr)[which(moduleColors==mod.color)]
  names(module_genes)[[i]] <- as.character(which.labels[[i]])
}
# # check the result | works
# names(module_genes)
# module_genes['OC12']

# [13 Dec 2021]
# Save a csv with the module identity information for all genes used in building the GCN
# make a dataframe with gene_name and module_identity
for (i in 1:length(module_genes)){
  if (i == 1){
    ocflo.control.mods <- data.frame(gene_name = module_genes[[i]],
                                    module_identity = as.character(names(module_genes)[i]))
  }
  else{
   foo <- data.frame(gene_name = module_genes[[i]],
                                  module_identity = as.character(names(module_genes)[i]))
    ocflo.control.mods <- rbind(ocflo.control.mods, foo)
  }

}

# # save the dataframe as a csv
# cflo.control.mods %>% 
#   left_join(module_ids, by = c("module_identity" = "new_labels")) %>% 
#   write.csv(.,
#             paste0(path_to_repo,
#                    "/results/WGCNA/cflo/cflo_heads_control_module_identity_new_labels.csv"),
#             row.names = F)
# # done.

# # save a copy with all the gene annotations
# # load Cflo gene annotations
# cflo_annots <- read.csv(paste0(path_to_repo,"/functions/func_data/cflo_annots.csv"),
#                             header=T, stringsAsFactors = F, na.strings = c(NA,""," "))
# cflo.control.mods %>%
#   left_join(cflo_annots, by="gene_name") %>% head()
#   write.csv(.,
#             paste0(path_to_repo,
#                    "/results/WGCNA/cflo/cflo_heads_control_module_identity_new_labels_annots.csv"),
#             row.names = F)

3.1 Load genes of interest

# load the list of all genes of interest
load(file = paste0(path_to_repo,"/results/genes_of_interest/goi_list.RData"))

# sapply(goi.list, class)

The_comparison

Full comparison

writeLines("#####################################################
How many genes are in each of my geneset of interest?
#####################################################")
## #####################################################
## How many genes are in each of my geneset of interest?
## #####################################################
## MAKE YOUR LIST OF GENES OF INTEREST ##

# LIST ONE - WGCNA modules
list1 <- module_genes
writeLines("List of interesting genes #1
----------------------------
Genes in each of the identified gene-clusters or modules")
## List of interesting genes #1
## ----------------------------
## Genes in each of the identified gene-clusters or modules
sapply(list1, length)
##  OC1  OC2  OC3  OC4  OC5  OC6  OC7  OC8  OC9 OC10 OC11 OC12 OC13 OC14 OC15 OC16 
## 1144 1316  317   68  241  226  857  361  144  173 1169  370   74   50  276   88
## LIST TWO - all genes of interst
list2 <- list(gois.tc6[[1]],
              gois.tc6[[2]],
              
              gois.tc6[[3]],
              gois.tc6[[4]],
              
              gois.tc6[[5]],
              gois.tc6[[6]],
              gois.tc6[[7]],
              gois.tc6[[8]],
              gois.tc6[[9]],
              gois.tc6[[10]],
              gois.tc6[[11]],
              gois.tc6[[12]]
              )
writeLines("List of interesting genes #2
----------------------------")
## List of interesting genes #2
## ----------------------------
names(list2) <- c("rhy24-Ocflo",
                  "rhy24-Ocflo-Okim",
                  
                  "rhy24-Ocflo|Beau-cluster3",
                  "rhy24-Ocflo|Beau-cluster4",
                  
                  "Ocflo-inf-UP-all",
                  "Ocflo-inf-DOWN-all",
                  "Ocflo-inf-manip-UP",
                  "Ocflo-inf-manip-DOWN",
                  "Ocflo-DEGs-inf-only",
                  "Ocflo-inf-DOWN-manip-UP",
                  "Ocflo-manip-UP-all",
                  "Ocflo-manip-DOWN-all"
                  )
sapply(list2, length)
##               rhy24-Ocflo          rhy24-Ocflo-Okim rhy24-Ocflo|Beau-cluster3 
##                      2285                       357                       451 
## rhy24-Ocflo|Beau-cluster4          Ocflo-inf-UP-all        Ocflo-inf-DOWN-all 
##                       324                       318                       395 
##        Ocflo-inf-manip-UP      Ocflo-inf-manip-DOWN       Ocflo-DEGs-inf-only 
##                       190                       161                       362 
##   Ocflo-inf-DOWN-manip-UP        Ocflo-manip-UP-all      Ocflo-manip-DOWN-all 
##                       116                      1867                      1625
## CHECK FOR OVERLAP

## make a GOM object
gom.1v2 <- newGOM(list1, list2,
       genome.size = nGenes)
png(paste0(path_to_repo, "/results/figures/", sample.name[1],"_gom_1v2.png"),
    width = 50, height = 30, units = "cm", res = 300)
drawHeatmap(gom.1v2,
              adj.p=T,
              cutoff=0.05,
              what="odds.ratio",
              # what="Jaccard",
              log.scale = T,
              note.col = "grey80")
trash <- dev.off()

 # writeLines("How many genes exactly are overlapping between the pairwise comparisons")
# getMatrix(gom.1v4, name = "intersection") %>% t()

writeLines("Visualizing the significant overlaps between your lists of interesting genes and the identified modules")
## Visualizing the significant overlaps between your lists of interesting genes and the identified modules
Where are my genesets of interests?

Where are my genesets of interests?

Focused comparison

writeLines("#####################################################
How many genes are in each of my geneset of interest?
#####################################################")
## #####################################################
## How many genes are in each of my geneset of interest?
## #####################################################
## MAKE YOUR LIST OF GENES OF INTEREST ##

# LIST ONE - WGCNA modules
list1 <- module_genes
writeLines("List of interesting genes #1
----------------------------
Genes in each of the identified gene-clusters or modules")
## List of interesting genes #1
## ----------------------------
## Genes in each of the identified gene-clusters or modules
sapply(list1, length)
##  OC1  OC2  OC3  OC4  OC5  OC6  OC7  OC8  OC9 OC10 OC11 OC12 OC13 OC14 OC15 OC16 
## 1144 1316  317   68  241  226  857  361  144  173 1169  370   74   50  276   88
## LIST THREE - focused genes of interest
list3 <- list(gois.tc6[[1]],
              gois.tc6[[2]],
              
              gois.tc6[[3]],
              gois.tc6[[4]],
              
              # gois.tc6[[5]],
              # gois.tc6[[6]],
              gois.tc6[[7]],
              gois.tc6[[8]],
              gois.tc6[[9]],
              gois.tc6[[10]]
              # gois.tc6[[11]],
              # gois.tc6[[12]]
              )
writeLines("List of interesting genes #2
----------------------------")
## List of interesting genes #2
## ----------------------------
names(list3) <- c("rhy24-Ocflo",
                  "rhy24-Ocflo-Okim",
                  
                  "rhy24-Ocflo|Beau-cluster3",
                  "rhy24-Ocflo|Beau-cluster4",
                  
                  # "Ocflo-inf-UP-all",
                  # "Ocflo-inf-DOWN-all",
                  "Ocflo-inf-manip-UP",
                  "Ocflo-inf-manip-DOWN",
                  "Ocflo-DEGs-inf-only",
                  "Ocflo-inf-DOWN-manip-UP"
                  # "Ocflo-manip-UP-all",
                  # "Ocflo-manip-DOWN-all"
                  )
sapply(list3, length)
##               rhy24-Ocflo          rhy24-Ocflo-Okim rhy24-Ocflo|Beau-cluster3 
##                      2285                       357                       451 
## rhy24-Ocflo|Beau-cluster4        Ocflo-inf-manip-UP      Ocflo-inf-manip-DOWN 
##                       324                       190                       161 
##       Ocflo-DEGs-inf-only   Ocflo-inf-DOWN-manip-UP 
##                       362                       116
## CHECK FOR OVERLAP

## make a GOM object
gom.1v3 <- newGOM(list1, list3,
       genome.size = nGenes)
png(paste0(path_to_repo, "/results/figures/", sample.name[1],"_gom_1v3.png"),
    width = 30, height = 30, units = "cm", res = 300)
drawHeatmap(gom.1v3,
              adj.p=T,
              cutoff=0.05,
              what="odds.ratio",
              # what="Jaccard",
              log.scale = T,
              note.col = "grey80")
trash <- dev.off()

 # writeLines("How many genes exactly are overlapping between the pairwise comparisons")
# getMatrix(gom.1v4, name = "intersection") %>% t()

writeLines("Visualizing the significant overlaps between your lists of interesting genes and the identified modules")
## Visualizing the significant overlaps between your lists of interesting genes and the identified modules
Where are my genesets of interests?

Where are my genesets of interests?

Step 4: Network statistics

From WGCNA-tutorial

5.1 Intramodular connectivity

“We begin by calculating the intramodular connectivity for each gene. (In network literature, connectivity is often referred to as”degree”.) The function intramodularConnectivity computes:

  • the whole network connectivity kTotal,
  • the within (intra)module connectivity kWithin,
  • the extra-modular connectivity kOut=kTotal-kWithin, and
  • the difference of the intra- and extra-modular connectivities kDiff=kIn-kOut=2*kIN-kTotal
# From what I can tell, colorh1 in the tutorial refers to moduleColors
colorh1 <- moduleColors

# Calculate the connectivities of the genes
Alldegrees1=intramodularConnectivity(adjMat = adj_matrix, colors = colorh1)
Alldegrees1 <- 
  Alldegrees1 %>% 
    rownames_to_column("gene_name") %>% 
    mutate_at(vars(starts_with("k")), 
              function(x){
                round(x,2)
                })
head(Alldegrees1)
##      gene_name kTotal kWithin   kOut   kDiff
## 1 GQ602_000001 257.04  163.01  94.02   68.99
## 2 GQ602_000002 135.90   59.67  76.23  -16.56
## 3 GQ602_000003 175.23   32.82 142.41 -109.59
## 4 GQ602_000004  81.25   11.45  69.80  -58.36
## 5 GQ602_000005 104.26   15.93  88.33  -72.40
## 6 GQ602_000006  96.75   40.82  55.93  -15.11

Plotting the mean (± 95% CI) connectivity of genes in different modules

pd <- position_dodge(0.1)

Alldegrees1 %>% 
  
  left_join(ocflo.control.mods, by="gene_name") %>%
  
  # PLOT FROM RAW DATA
  mutate(module_identity = factor(module_identity, 
                                  levels = paste0("OC",1:nrow(module_ids)))) %>%
  
  ggplot(aes(x=module_identity, y=kTotal)) +
  # ggplot(aes(x=module_identity, y=kWithin)) +
  
    # geom_hline(yintercept = 45, col="darkgrey", alpha=0.8) +
    # geom_hline(yintercept = 30, col="darkgrey", alpha=0.8) +
    # geom_hline(yintercept = 75, col="darkgrey", alpha=0.8) +
  
    geom_boxplot(fill="lightgrey",
                 alpha=0.6,
                 outlier.color = "grey60",
                 position = "dodge2") +
    
    theme_Publication() +
    scale_colour_Publication() +
    xlab("") +
    ggtitle("") +
  
    ylab("Total connectivity") +
    # ylab("Intramodular connectivity") +
  
    theme(text = element_text(size = 20, colour = 'black'),
          legend.position = "none",
          axis.line.y = element_line(colour = "transparent",size=1)) +
    coord_flip()

Step 5: Export the results

There are several things we could output from our analyses, we decided to report the following in the supplementary file:

  • cluster identity
  • total connectivity
  • pfam annotations enriched in the module

Get pfam enrichments for each module

module_pfams <- list()

bg.genes <- dat[[1]] ## all genes used to make the network

for (i in 1:length(which.labels)) {
  
  # get name of the module
  m <- which.labels[[i]]
  
  which.test <- "pfams"
  
  # save the enrichment results
  module_pfams[[i]] <- 
    module_genes[[m]] %>% 
    check_enrichment(.,
                     bg = dat[[1]],
                     what = which.test,
                     clean = T,
                     expand = T,
                     plot = F)
}
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "No enriched terms found; can't expand."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "No enriched terms found; can't expand."
## Now clean it up
sapply(module_pfams, nrow)
## [[1]]
## [1] 475
## 
## [[2]]
## [1] 12
## 
## [[3]]
## [1] 6
## 
## [[4]]
## [1] 52
## 
## [[5]]
## [1] 5
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## [1] 123
## 
## [[9]]
## NULL
## 
## [[10]]
## [1] 75
## 
## [[11]]
## [1] 0
## 
## [[12]]
## [1] 5
## 
## [[13]]
## [1] 14
## 
## [[14]]
## [1] 22
## 
## [[15]]
## [1] 124
## 
## [[16]]
## [1] 0
for (i in 1:length(module_pfams)) {
  
  if(is.null(nrow(module_pfams[[i]]))) {
    paste(which.labels[[i]],"is null") %>% print()
  } else if(nrow(module_pfams[[i]])==0) {
    paste(which.labels[[i]], "is an empty tibble") %>% print()
  } else {
    
    if(i==1) {
    module.pfams <- module_pfams[[i]]
    } else {
      module.pfams <- rbind(module.pfams, module_pfams[[i]])
    }
  }
}
## [1] "OC6 is null"
## [1] "OC7 is null"
## [1] "OC9 is null"
## [1] "OC11 is an empty tibble"
## [1] "OC16 is an empty tibble"
## change the name of the column
module.pfams <- 
  module.pfams %>% 
  select(gene_name, enriched_in_module=annot_desc) %>% 
  filter(enriched_in_module!="no_desc")

# check the output dataframe
module.pfams %>% head()
## # A tibble: 6 x 2
## # Groups:   gene_name [6]
##   gene_name    enriched_in_module
##   <chr>        <chr>             
## 1 GQ602_000010 MFS_1             
## 2 GQ602_000016 MFS_1; Sugar_tr   
## 3 GQ602_000018 Enterotoxin_a     
## 4 GQ602_000112 zf-C2H2           
## 5 GQ602_000135 Enterotoxin_a     
## 6 GQ602_000344 FAD_binding_3

Let’s make the file

results.gcn <- 
  ocflo.control.mods %>% 
  
  pull(gene_name) %>% 
  
  ## rhythmicity data
  TC6_annotator() %>% 
  select(gene_name = ophio_gene, everything()) %>% 
  
  ## cluster identity
  left_join(ocflo.control.mods, by="gene_name") %>% 
  
  ## add total connectivity data
  left_join(Alldegrees1 %>% select(gene_name,kTotal), by="gene_name") %>% 
  
  ## add the enriched pfam
  left_join(module.pfams, by="gene_name") %>% 
  
  ## Geneset: "rhy24-Ocflo-Okim"
  mutate(rhy_ocflo_okim = ifelse(gene_name %in% gois.tc6[[2]], "yes", "no")) %>% 
  
  ## Geneset: "rhy24-Ocflo|Beau-cluster3"
  mutate(rhy_ocflo_beau_cluster3 = ifelse(gene_name %in% gois.tc6[[3]], "yes", "no")) %>% 
  ## Geneset: "rhy24-Ocflo|Beau-cluster4"
  mutate(rhy_ocflo_beau_cluster4 = ifelse(gene_name %in% gois.tc6[[4]], "yes", "no")) %>% 
  
  ## order the columns and rows
  select(module_identity, kTotal, enriched_in_module, 
         gene_name, gene_desc, 
         rhy_ocflo_okim, rhy_ocflo_beau_cluster3, rhy_ocflo_beau_cluster4,
         everything()) %>% 
  arrange(module_identity, enriched_in_module, desc(kTotal))
  
  
## export it
write.csv(results.gcn,
          file = paste0(path_to_repo, "/results/00_supplementary_files/07_Ocflo_GCN_results.csv"),
          row.names = F)

Step 6: Explore modules

module: OC1

results.gcn %>% 
  filter(module_identity=="OC1") %>% 
  
  # ## summarize the results
  # group_by(inf_v_control, control_rhy24) %>% 
  # summarize(n()) %>% 
  
  ## pull rhythmic genes that are up/down-regulated
  # filter(inf_v_control=="up" & control_rhy24=="yes") %>%
  filter(inf_v_control=="down" & control_rhy24=="yes") %>%
  
  
  ## run enrichments
  pull(gene_name) %>% 
  check_enrichment(.,
                   bg = dat[[1]],
                   what = "pfams",
                   clean = T,
                   expand = T) %>% 
  
  ## run stacked zplots
  pull(gene_name) %>% 
  stacked.zplot_tc6(plot.mean = F, bg.alpha = 0.8, bg.lwd=1.5)
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## [[1]]